This is my notebook that I will use to work through the Tidy Text Mining book by Julia Silge and David Robinson.

library(tidyverse)
library(tidytext)
library(stringr)
library(janeaustenr)


# Emily Dickinson stanza stored as a character vector, one element per line.
text <- c("Because I could not stop for Death - ", 
          "He kindly stopped for me -",
          "The Carriage held but just Ourselves -",
          "and Immortality")

text
## [1] "Because I could not stop for Death - " 
## [2] "He kindly stopped for me -"            
## [3] "The Carriage held but just Ourselves -"
## [4] "and Immortality"

# tibble() replaces data_frame(), which is deprecated in the tibble package.
text_df <- tibble(line = 1:4, text = text)

text_df
# One token (word) per row; unnest_tokens() lowercases tokens by default.
text_df %>% 
  unnest_tokens(word, text)  # to_lower = FALSE to keep uppercase

Working with Jane Austen Books

Using the Jane Austen dataset we can make it tidy. First we’ll use mutate to create a column from existing data. Annotate line numbers and then keep track of chapters using regex.

# Annotate every line of the six Austen novels with its line number and a
# running chapter count (cumsum of chapter-heading matches).
original_books <- austen_books() %>% 
  group_by(book) %>% 
  mutate(linenumber = row_number(),
         # FIX: stringr::regex() takes ignore_case (underscore), not the
         # base-R ignore.case.  With ignore.case the modifier was not applied,
         # so the lowercase pattern never matched "Chapter"/"CHAPTER" headings
         # and every chapter stayed 0.  Matches the usage at the later
         # tidy_books rebuild in this file.
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
                                                  ignore_case = TRUE)))) %>% 
  ungroup()

original_books

Restructure it in one-token-per-row format with the unnest_tokens() function.

# Restructure the annotated novels into the one-word-per-row tidy format.
tidy_books <- unnest_tokens(original_books, word, text)

tidy_books

Removing stop words

We can remove stop words

# Load the stop-word lexicon bundled with tidytext.
data(stop_words)

# anti_join() drops every row whose word appears in the stop-word list.
tidy_books <- tidy_books %>% 
  anti_join(stop_words)
## Joining, by = "word"
# Most frequent remaining words across all six novels.
tidy_books %>% 
  count(word, sort = TRUE)

Plotting data

# Bar chart of words occurring more than 600 times, ordered by frequency.
tidy_books %>% 
  count(word, sort = TRUE) %>% 
  filter(n > 600) %>% 
  mutate(word = reorder(word, n)) %>%   # order bars by count
  ggplot(aes(word, n))+
  geom_col()+
  xlab(NULL)+
  coord_flip()+
  theme_bw()

#### Gutenbergr

HG wells books

library(gutenbergr)


# Download four H.G. Wells novels by Project Gutenberg ID.
hgwells <-  gutenberg_download(c(35, 36, 5230, 159))
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# Tokenize into words and remove stop words.
tidy_hgwells <- hgwells %>% 
  unnest_tokens(word, text) %>% 
  anti_join(stop_words)
## Joining, by = "word"
# Most common words in the Wells corpus.
tidy_hgwells %>% 
  count(word, sort = TRUE)

Most common words in novels by the Bronte sisters

# Download five Bronte novels and tidy them the same way as the Wells texts.
bronte <-  gutenberg_download(c(1260, 768, 969, 9182, 767))

tidy_bronte <- bronte %>% 
  unnest_tokens(word, text) %>% 
  anti_join(stop_words)
## Joining, by = "word"
# Most common words in the Bronte corpus.
tidy_bronte %>% 
  count(word, sort = TRUE)

Comparing Bronte and HG Wells

# Combine the three tidied corpora and compute each word's within-author
# proportion.  The spread/gather pair reshapes so Jane Austen's proportions
# get their own column while the other two authors stay long for faceting.
# NOTE(review): spread()/gather() are superseded by pivot_wider()/pivot_longer().
frequency <- bind_rows(mutate(tidy_bronte, author = "Bronte Sisters"),
                       mutate(tidy_hgwells, author = "H.G. Wells"),
                       mutate(tidy_books, author = "Jane Austen")) %>% 
  mutate(word = str_extract(word, "[a-z']+")) %>%   # keep only the word's letters/apostrophes
  count(author, word) %>% 
  group_by(author) %>% 
  mutate(proportion = n / sum(n)) %>% 
  select(-n) %>% 
  spread(author, proportion) %>% 
  gather(author, proportion, `Bronte Sisters`:`H.G. Wells`)

Graphing the comparisons

library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
# Word-frequency scatter: each author against Jane Austen; words near the
# diagonal are used at similar rates in both corpora.
# FIX: check_overlap is a geom_text() parameter, not an aesthetic.  Inside
# aes() it was ignored ("Ignoring unknown aesthetics" warning), so labels
# overlapped; it now actually suppresses overlapping labels.
ggplot(frequency, aes(x = proportion, y = `Jane Austen`,
                      color = abs(`Jane Austen` - proportion))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0,0.001),
                       low = "darkslategray4", high = "gray75") +
  facet_wrap(~author, ncol = 2) +
  theme(legend.position = "none") +
  labs(y = "Jane Austen", x = NULL)
## Warning: Removed 41357 rows containing missing values (geom_point).
## Warning: Removed 41359 rows containing missing values (geom_text).

Correlation analysis

# How similar are the word-frequency profiles?  Pearson correlation of each
# author's proportions against Jane Austen's.
cor.test(data = frequency[frequency$author == "Bronte Sisters",],
         ~ proportion + `Jane Austen`)
## 
##  Pearson's product-moment correlation
## 
## data:  proportion and Jane Austen
## t = 119.65, df = 10404, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7527846 0.7689620
## sample estimates:
##       cor 
## 0.7609915
# Wells (~0.42) correlates less with Austen than the Brontes do (~0.76).
cor.test(data = frequency[frequency$author == "H.G. Wells",],
         ~ proportion + `Jane Austen`)
## 
##  Pearson's product-moment correlation
## 
## data:  proportion and Jane Austen
## t = 36.441, df = 6053, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4032800 0.4445987
## sample estimates:
##       cor 
## 0.4241601

Sentiments dataset

library(tidytext)

# The combined sentiments dataset bundled with tidytext.
sentiments

Sentiment scores

# AFINN: numeric sentiment scores per word
get_sentiments("afinn")
# Bing: binary positive/negative labels
get_sentiments("bing")
# NRC: words tagged with emotion categories (joy, anger, ...) plus pos/neg
get_sentiments("nrc")

Sentiment analysis can be performed with an inner join

# Rebuild tidy_books from scratch, keeping the linenumber and chapter
# annotations before tokenizing (linenumber drives the 80-line index below).
# Note: this chunk correctly uses stringr's ignore_case modifier.
tidy_books <- austen_books() %>% 
  group_by(book) %>% 
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
                                                 ignore_case = TRUE)))) %>% 
  ungroup() %>% 
  unnest_tokens(word, text)

tidy_books

Get a list of words that are associated with ‘joy’ and perform an inner_join to find words that have ‘joy’ sentiment within Jane Austen’s Emma.

# Filter words associated with 'joy'
nrcjoy <- get_sentiments("nrc") %>% 
  filter(sentiment == "joy")

# Find words associated with joy in Emma
tidy_books %>% 
  filter(book == "Emma") %>% 
  inner_join(nrcjoy) %>%   # keeps only words present in the joy list
  count(word, sort = TRUE)
## Joining, by = "word"

Count up positive or negative words that are within sections of each book. We will define an index to keep track of where we are in the narrative. The index will count up sections of 80 lines of text

# Net sentiment (positive - negative word counts) per 80-line section.
janeaustensentiment <- tidy_books %>% 
  inner_join(get_sentiments("bing")) %>% 
  count(book, index = linenumber %/% 80, sentiment) %>% 
  spread(sentiment, n, fill =0) %>%   # sections lacking a sentiment get 0, not NA
  mutate(sentiment = positive - negative)
## Joining, by = "word"
janeaustensentiment
# Sentiment trajectory across the narrative of each novel.
ggplot(janeaustensentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

# Compare the three lexicons on a single novel.
pride_prejudice <- tidy_books %>% 
  filter(book == "Pride & Prejudice")

pride_prejudice
# AFINN assigns numeric scores, so per-chunk sentiment is a sum.
# NOTE(review): newer tidytext releases name the AFINN column `value`, not
# `score` -- confirm against the installed version.
afinn <- pride_prejudice %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(score)) %>% 
  mutate(method = "AFINN")
## Joining, by = "word"
afinn
# Bing and NRC label words positive/negative, so per-chunk sentiment is the
# count difference (positive - negative).
bing_and_nrc <- bind_rows(
  pride_prejudice %>% 
    inner_join(get_sentiments("bing")) %>% 
    mutate(method = "Bing et al."),
  pride_prejudice %>% 
    inner_join(get_sentiments("nrc") %>% 
                 filter(sentiment %in% c("positive",
                                         "negative"))) %>% 
    mutate(method = "NRC")) %>% 
    count(method, index = linenumber %/% 80, sentiment) %>% 
    spread(sentiment, n, fill = 0) %>% 
    mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bing_and_nrc

Comparing three sentiment lexicons using Pride and Prejudice

# Plot all three lexicons' trajectories for Pride & Prejudice side by side.
bind_rows(afinn,
          bing_and_nrc) %>% 
  ggplot(aes(index, sentiment, fill = method)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~method, ncol = 1 , scales = "free_y")

# Compare how many positive vs negative words each lexicon contains.
get_sentiments("nrc") %>% 
  filter(sentiment %in% c("positive",
                          "negative")) %>% 
  count(sentiment)
get_sentiments("bing") %>% 
  count(sentiment)
# How much does each individual word contribute to each sentiment?
# NOTE(review): tidy_books covers all six novels, but the plot title below
# says Pride and Prejudice -- confirm which was intended.
bing_words_counts <- tidy_books %>% 
  inner_join(get_sentiments("bing")) %>% 
  count(word, sentiment, sort = TRUE) %>% 
  ungroup()
## Joining, by = "word"
bing_words_counts
# Top 10 contributing words per sentiment.
bing_words_counts %>% 
  group_by(sentiment) %>% 
  top_n(10) %>%   # selects by the last column, n
  ungroup() %>% 
  mutate(word = reorder(word, n)) %>% 
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE)+
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL,
       title = "Top 10 words by Sentiment for Pride and Prejudice") +
  coord_flip()
## Selecting by n

“Miss” is labeled as a negative word by the lexicon, but in Jane Austen’s works it is mostly used as a title, so we add it to a custom stop-word list.

# Add "miss" (a title in Austen, not a negative sentiment) to the stop words.
# tibble() replaces data_frame(), which is deprecated in the tibble package.
custom_stop_words <- bind_rows(tibble(word = c("miss"),
                                      lexicon = c("custom")),
                               stop_words)

custom_stop_words

Wordclouds

library(wordcloud)
## Loading required package: RColorBrewer
# Word cloud of the 100 most frequent non-stop words.
tidy_books %>% 
  anti_join(stop_words) %>% 
  count(word) %>% 
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Comparison word cloud: positive vs negative words, sized by count.
# FIX: the acast() argument is value.var, not value.car.  The typo made
# acast() fall back to guessing the value column ("Using n as value column"
# message) -- the same result here, but only by accident.
tidy_books %>% 
  inner_join(get_sentiments("bing")) %>% 
  count(word, sentiment, sort = TRUE) %>% 
  acast(word~sentiment, value.var = "n", fill = 0) %>% 
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining, by = "word"

Looking at Units Beyond Just Words

# Tokenize Pride & Prejudice into sentences instead of words.
# tibble() replaces data_frame(), which is deprecated in the tibble package.
PandP_sentences <- tibble(text = prideprejudice) %>% 
  unnest_tokens(sentence, text, token = "sentences")
PandP_sentences$sentence[2]
## [1] "however little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters."

Figuring out how many chapters are in a book

# Split each book into chapters by tokenizing on a chapter-heading regex.
# NOTE(review): in "Chapter|CHAPTER [\\dIVXLC]" the alternation binds so it
# matches "Chapter" anywhere OR "CHAPTER " plus one of \d I V X L C --
# confirm this is the intended split pattern.
austen_chapters <- austen_books() %>% 
  group_by(book) %>% 
  unnest_tokens(chapter, text , token = "regex",
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>% 
  ungroup()


# One row per chapter, so n() counts chapters per book.
austen_chapters %>% 
  group_by(book) %>% 
  summarise(chapters = n())
# Negative-sentiment words from the Bing lexicon, used as a filter below.
bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

# Total words per chapter (denominator for the negativity ratio).
wordcounts <- tidy_books %>% 
  group_by(book, chapter) %>% 
  summarize(words = n())

# For each book, find the chapter with the highest share of negative words.
tidy_books %>% 
  semi_join(bingnegative) %>%   # keep only negative-lexicon words
  group_by(book, chapter) %>% 
  summarize(negativewords = n()) %>% 
  left_join(wordcounts, by = c("book", "chapter")) %>% 
  mutate(ratio = negativewords/words) %>% 
  filter(chapter !=0) %>%   # chapter 0 is text before the first heading
  top_n(1) %>%   # highest ratio per book
  ungroup()
## Joining, by = "word"
## Selecting by ratio

# Chapter 3

Term Frequency in Jane Austen’s Novels

# Word counts per book (stop words kept; tf-idf will downweight them later).
book_words <- austen_books() %>% 
  unnest_tokens(word,text) %>% 
  count(book,word,sort = TRUE) %>% 
  ungroup()

# Total number of words in each book.
total_words <- book_words %>% 
  group_by(book) %>% 
  summarize(total = sum(n))

# Attach each book's total to every word row.
book_words <- left_join(book_words, total_words)
## Joining, by = "book"
book_words

Term frequency distribution in Jane Austen’s novels

# Distribution of term frequency (n/total) per book; the x-axis is truncated
# at 0.0009, which is what produces the removed-rows warning below.
ggplot(book_words, aes(n/total, fill = book)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~book, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 896 rows containing non-finite values (stat_bin).

Zipf’s Law

Zipf’s Law: frequency that a word appears is inversely proportional to its rank.

# Rank each word within its book and compute its term frequency.  book_words
# was built with count(sort = TRUE), so row_number() is the frequency rank.
freq_by_rank <- book_words %>% 
  group_by(book) %>% 
  mutate(rank = row_number(),
         `term frequency` = n / total)

freq_by_rank

Graph of Zipf’s law for Jane Austen’s novels

# Zipf's law plot: rank vs term frequency is roughly linear on log-log axes.
freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = book)) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = TRUE) +
  scale_x_log10() +
  scale_y_log10()

# Fit the power-law exponent on the middle section of the rank range.
rank_subset <-  freq_by_rank %>% 
  filter(rank <500,
         rank >10)


lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
## 
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
## 
## Coefficients:
## (Intercept)  log10(rank)  
##     -0.6226      -1.1125
# Replot with the fitted line overlaid (slope near -1, as Zipf predicts).
freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = book))+
  geom_abline(intercept = -0.62, slope = -1.1, color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()

The bind_tf_idf Function

# Add tf, idf and tf-idf columns; idf downweights words common to all books.
book_words <- book_words %>% 
  bind_tf_idf(word, book, n)

book_words
# Highest tf-idf words overall.
book_words %>% 
  select(-total) %>% 
  arrange(desc(tf_idf))
# Top 15 tf-idf words per book; the factor levels keep bars ordered by tf-idf.
book_words %>% 
  arrange(desc(tf_idf)) %>% 
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(book) %>% 
  top_n(15) %>% 
  ungroup() %>% 
  ggplot(aes(word, tf_idf, fill = book)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~book, ncol = 2,scales = "free") +
  coord_flip()
## Selecting by tf_idf

A corpus of Physics Text

# Download four classic physics texts, keeping the author metadata column.
physics <- gutenberg_download(c(37729, 14725, 13476, 5001),
                              meta_fields = "author")
physics_words <- physics %>% 
  unnest_tokens(word, text) %>% 
  count(author, word, sort = TRUE) %>% 
  ungroup()

physics_words
# Explicit author order for the plot facets.
author_order <- c("Galilei, Galileo", "Huygens, Christiaan", "Tesla, Nikola", "Einstein, Albert")

# tf-idf per author, with factor levels set for ordered plotting.
plot_physics <- physics_words %>% 
  bind_tf_idf(word, author, n) %>% 
  arrange(desc(tf_idf)) %>% 
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  mutate(author = factor(author, levels = author_order))

plot_physics
# Top 15 tf-idf words per author.
plot_physics %>% 
  group_by(author) %>%
  top_n(15, tf_idf) %>% 
  ungroup() %>% 
  mutate(word = reorder(word, tf_idf)) %>% 
  ggplot(aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~author, ncol = 2, scales = "free") +
  coord_flip()

Isolating “eq” from Einstein’s text

# Inspect lines containing "eq." -- a typesetting artifact, not a real word.
physics %>% 
  filter(str_detect(text, "eq\\.")) %>% 
  select(text)

K1 was used for the coordinate system for Einstein

# Inspect occurrences of "K1" and "AK" -- notation/cleanup artifacts.
physics %>%
  filter(str_detect(text, "K1")) %>% 
  select(text)
physics %>%
  filter(str_detect(text, "AK")) %>% 
  select(text)
# Custom stop words: the typesetting/notation artifacts identified above.
# tibble() replaces data_frame(), which is deprecated in the tibble package.
mystopwords <- tibble(word = c("eq","co","rc","ac","ak","bn",
                               "fig", "file", "cg", "cb", "cm"))

# Drop the artifacts, then recompute tf-idf and keep the top 15 per author.
physics_words <- anti_join(physics_words, mystopwords, by = "word")

plot_physics <- physics_words %>% 
  bind_tf_idf(word, author, n) %>% 
  arrange(desc(tf_idf)) %>% 
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(author) %>% 
  top_n(15, tf_idf) %>% 
  ungroup() %>% 
  mutate(author = factor(author, levels = author_order))

plot_physics
ggplot(plot_physics, aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs( x = NULL, y = "tf-idf") +
  facet_wrap(~author, ncol = 2, scales = "free") +
  coord_flip()

Chapter 4 Relationships Between Words: N-grams and Correlations

# Tokenize into consecutive word pairs (bigrams), one per row.
austen_bigrams <- austen_books() %>% 
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

austen_bigrams

Counting and Filtering N-grams

# Most frequent bigrams (dominated by stop-word pairs at this point).
austen_bigrams %>% 
  count(bigram, sort = TRUE)
# Split each bigram into its two words so stop words can be filtered out.
bigrams_separated <-  austen_bigrams %>% 
  separate(bigram, into = c("word1", "word2"), sep = " ")
  
# Keep only bigrams where neither word is a stop word.
bigrams_filtered <- bigrams_separated %>% 
  filter(!word1 %in% stop_words$word) %>% 
  filter(!word2 %in% stop_words$word)


bigram_counts <- bigrams_filtered %>% 
  count(word1, word2, sort = TRUE)

bigram_counts
# Recombine the filtered pairs into a single bigram column.
bigrams_united <- bigrams_filtered %>% 
  unite(bigram, word1, word2, sep = " ")

bigrams_united
# Same pipeline applied to trigrams.
austen_books() %>% 
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>% 
  separate(trigram, into = c("word1", "word2", "word3"), sep = " ") %>% 
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>% 
  count(word1, word2, word3, sort = TRUE)

Analyzing Bigrams

Identifying the “streets” in each book.

# Words preceding "street" pick out the street names mentioned in each book.
bigrams_filtered %>% 
  filter(word2 == "street") %>% 
  count(book, word1, sort = TRUE)
# tf-idf computed over bigrams instead of single words.
bigram_tf_idf <- bigrams_united %>% 
  count(book, bigram) %>% 
  bind_tf_idf(bigram, book, n) %>% 
  arrange(desc(tf_idf))

bigram_tf_idf 

The 12 bigrams with the highest tf-idf from each Jane Austen novel.

# Plot the 12 highest tf-idf bigrams per novel; factor levels keep bar order.
bigram_tf_idf %>% 
  arrange(desc(tf_idf)) %>% 
  mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>% 
  group_by(book) %>% 
  top_n(12,tf_idf) %>% 
  ungroup() %>% 
  ggplot(aes(x = bigram, y = tf_idf, fill = book))+
    geom_col(show.legend = FALSE) +
    facet_wrap(~book, ncol = 2, scales = "free")+
    coord_flip()

Using Bigrams to Provide Context in Sentiment Analysis

# Words most often preceded by "not" -- candidates for flipped sentiment.
bigrams_separated %>% 
  filter(word1 == "not") %>% 
  count(word1, word2, sort = TRUE)
AFINN <- get_sentiments("afinn") 

AFINN
# Join the negated words to AFINN to see which scored words "not" precedes.
# NOTE(review): newer tidytext names the AFINN column `value`, not `score` --
# confirm against the installed version.
not_words <- bigrams_separated %>% 
  rename(word = word2) %>% 
  filter(word1 == "not") %>% 
  inner_join(AFINN) %>% 
  count(word, score, sort = TRUE) %>% 
  ungroup()
## Joining, by = "word"
not_words